Next steps

# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
library(ggcorrplot)
# Load data (relative file path) and view contents
df <- read.csv("../data/energy_dataset.csv")
head(df)
dim(df) # Number of rows and columns
## [1] 35064    29
summary(df) # Distribution and missingness for each variable
##      time           generation.biomass generation.fossil.brown.coal.lignite
##  Length:35064       Min.   :  0.0      Min.   :  0.0                       
##  Class :character   1st Qu.:333.0      1st Qu.:  0.0                       
##  Mode  :character   Median :367.0      Median :509.0                       
##                     Mean   :383.5      Mean   :448.1                       
##                     3rd Qu.:433.0      3rd Qu.:757.0                       
##                     Max.   :592.0      Max.   :999.0                       
##                     NA's   :19         NA's   :18                          
##  generation.fossil.coal.derived.gas generation.fossil.gas
##  Min.   :0                          Min.   :    0        
##  1st Qu.:0                          1st Qu.: 4126        
##  Median :0                          Median : 4969        
##  Mean   :0                          Mean   : 5623        
##  3rd Qu.:0                          3rd Qu.: 6429        
##  Max.   :0                          Max.   :20034        
##  NA's   :18                         NA's   :18           
##  generation.fossil.hard.coal generation.fossil.oil generation.fossil.oil.shale
##  Min.   :   0                Min.   :  0.0         Min.   :0                  
##  1st Qu.:2527                1st Qu.:263.0         1st Qu.:0                  
##  Median :4474                Median :300.0         Median :0                  
##  Mean   :4256                Mean   :298.3         Mean   :0                  
##  3rd Qu.:5839                3rd Qu.:330.0         3rd Qu.:0                  
##  Max.   :8359                Max.   :449.0         Max.   :0                  
##  NA's   :18                  NA's   :19            NA's   :18                 
##  generation.fossil.peat generation.geothermal
##  Min.   :0              Min.   :0            
##  1st Qu.:0              1st Qu.:0            
##  Median :0              Median :0            
##  Mean   :0              Mean   :0            
##  3rd Qu.:0              3rd Qu.:0            
##  Max.   :0              Max.   :0            
##  NA's   :18             NA's   :18           
##  generation.hydro.pumped.storage.aggregated
##  Mode:logical                              
##  NA's:35064                                
##                                            
##                                            
##                                            
##                                            
##                                            
##  generation.hydro.pumped.storage.consumption
##  Min.   :   0.0                             
##  1st Qu.:   0.0                             
##  Median :  68.0                             
##  Mean   : 475.6                             
##  3rd Qu.: 616.0                             
##  Max.   :4523.0                             
##  NA's   :19                                 
##  generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
##  Min.   :   0.0                             Min.   :   0                    
##  1st Qu.: 637.0                             1st Qu.:1077                    
##  Median : 906.0                             Median :2164                    
##  Mean   : 972.1                             Mean   :2605                    
##  3rd Qu.:1250.0                             3rd Qu.:3757                    
##  Max.   :2000.0                             Max.   :9728                    
##  NA's   :19                                 NA's   :18                      
##  generation.marine generation.nuclear generation.other
##  Min.   :0         Min.   :   0       Min.   :  0.00  
##  1st Qu.:0         1st Qu.:5760       1st Qu.: 53.00  
##  Median :0         Median :6566       Median : 57.00  
##  Mean   :0         Mean   :6264       Mean   : 60.23  
##  3rd Qu.:0         3rd Qu.:7025       3rd Qu.: 80.00  
##  Max.   :0         Max.   :7117       Max.   :106.00  
##  NA's   :19        NA's   :17         NA's   :18      
##  generation.other.renewable generation.solar generation.waste
##  Min.   :  0.00             Min.   :   0     Min.   :  0.0   
##  1st Qu.: 73.00             1st Qu.:  71     1st Qu.:240.0   
##  Median : 88.00             Median : 616     Median :279.0   
##  Mean   : 85.64             Mean   :1433     Mean   :269.5   
##  3rd Qu.: 97.00             3rd Qu.:2578     3rd Qu.:310.0   
##  Max.   :119.00             Max.   :5792     Max.   :357.0   
##  NA's   :18                 NA's   :18       NA's   :19      
##  generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
##  Min.   :0                Min.   :    0           Min.   :   0            
##  1st Qu.:0                1st Qu.: 2933           1st Qu.:  69            
##  Median :0                Median : 4849           Median : 576            
##  Mean   :0                Mean   : 5464           Mean   :1439            
##  3rd Qu.:0                3rd Qu.: 7398           3rd Qu.:2636            
##  Max.   :0                Max.   :17436           Max.   :5836            
##  NA's   :18               NA's   :18                                      
##  forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
##  Mode:logical                      Min.   :  237                  
##  NA's:35064                        1st Qu.: 2979                  
##                                    Median : 4855                  
##                                    Mean   : 5471                  
##                                    3rd Qu.: 7353                  
##                                    Max.   :17430                  
##                                                                   
##  total.load.forecast total.load.actual price.day.ahead   price.actual   
##  Min.   :18105       Min.   :18041     Min.   :  2.06   Min.   :  9.33  
##  1st Qu.:24794       1st Qu.:24808     1st Qu.: 41.49   1st Qu.: 49.35  
##  Median :28906       Median :28901     Median : 50.52   Median : 58.02  
##  Mean   :28712       Mean   :28697     Mean   : 49.87   Mean   : 57.88  
##  3rd Qu.:32263       3rd Qu.:32192     3rd Qu.: 60.53   3rd Qu.: 68.01  
##  Max.   :41390       Max.   :41015     Max.   :101.99   Max.   :116.80  
##                      NA's   :36
#describe(df)

Missing data

There are eight columns that contain no values (all NAs or zeroes). These are removed here.

df_clean <- df %>% 
  select(-c(generation.fossil.coal.derived.gas,
            generation.fossil.oil.shale,
            generation.fossil.peat,
            generation.geothermal,
            generation.marine,
            generation.hydro.pumped.storage.aggregated,
            generation.wind.offshore,
            forecast.wind.offshore.eday.ahead))
colSums(is.na(df_clean)) # Number of missing values for each remaining variable
##                                        time 
##                                           0 
##                          generation.biomass 
##                                          19 
##        generation.fossil.brown.coal.lignite 
##                                          18 
##                       generation.fossil.gas 
##                                          18 
##                 generation.fossil.hard.coal 
##                                          18 
##                       generation.fossil.oil 
##                                          19 
## generation.hydro.pumped.storage.consumption 
##                                          19 
##  generation.hydro.run.of.river.and.poundage 
##                                          19 
##            generation.hydro.water.reservoir 
##                                          18 
##                          generation.nuclear 
##                                          17 
##                            generation.other 
##                                          18 
##                  generation.other.renewable 
##                                          18 
##                            generation.solar 
##                                          18 
##                            generation.waste 
##                                          19 
##                     generation.wind.onshore 
##                                          18 
##                    forecast.solar.day.ahead 
##                                           0 
##             forecast.wind.onshore.day.ahead 
##                                           0 
##                         total.load.forecast 
##                                           0 
##                           total.load.actual 
##                                          36 
##                             price.day.ahead 
##                                           0 
##                                price.actual 
##                                           0

16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.

# Rows where data is missing
df_clean[!complete.cases(df_clean),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df_clean[!complete.cases(df_clean),]))
missing_per_row
##   100   109   110   111   112   113   114   452   453   644   662   752   753 
##    14    15    15    15    15    15    15    14    14    14    15     1     1 
##   754   757   758   759   760   761   762   763   764  2259  2529  2624  2709 
##     1     1     1     1     1     1     1     1     1     1    15     1    15 
##  2914  3555  3969  6584  6587  8050 11237 11525 11527 11903 12673 13342 13392 
##     1     1    14     1    15    15     1     1     1     1     1    14     1 
## 15273 15983 16613 25165 25172 30186 30897 
##     1     1     1     1     1     1    15
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9041096

What are we left with?

35,000 obs across 20 variables, hourly from start 2015 to end 2018:

And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities

Plotting

# Formatting time variable to datetime object
# NB:  Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC

df_clean$time <- as_datetime(df_clean$time)
head(df_clean)

Focusing on generation variables

gen_data <- df_clean %>% 
  select(c(1, starts_with("generation")))
# Converting to long form
long_gen_data <- gen_data  %>% 
  pivot_longer(cols = -time, names_to = "generation_source") %>% 
  mutate(generation_source = str_replace(generation_source, "generation.", ""))

head(long_gen_data)

Plot distribution of (generation) data missingness

# NB: MAYBE DON'T DO WHOLE DATASET LOCALLY, IT CRASHED MY R SESSION
long_gen_data_small <- long_gen_data[1:10000,]
ggplot_na_distribution(x=long_gen_data_small$value, y=long_gen_data_small$time)

# Interpolate missing values
df_clean %<>% mutate(across(generation.biomass:price.actual, .fns = na_interpolation, option = 'linear')) %>% 
  select(c(1, starts_with("generation")))  
long_gen_data <- df_clean %>% 
  pivot_longer(cols = -time, names_to = "generation_source") %>% 
  mutate(generation_source = str_replace(generation_source, "generation.", ""))

# Verify
long_gen_data[!complete.cases(long_gen_data),]

Getting statistics over entire dataset for each source

overall_stats <- long_gen_data %>%
  group_by(generation_source) %>%
  summarise(sum = sum(value, na.rm = TRUE),
            mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)) %>%
  mutate(generation_source= fct_reorder(generation_source, desc(sum)))

sources_by_sum <- arrange(overall_stats, desc(sum))
sources_by_sum
ggplot(data = overall_stats) + geom_col(aes(x = generation_source, y = sum)) + theme(axis.text.x = element_text(angle=90))

sources_by_sum %>% pivot_longer(cols = c(sum, mean, sd, max), names_to = "stat", 
                                values_to = "value") %>%
  filter(stat != "sum") %>%
  ggplot() + geom_col(aes(x = generation_source, y = value, fill = stat), position = "dodge") + theme(axis.text.x = element_text(angle=90))

Using this information to re-factor the generation_source variable, so that plots will order the sources from biggest to smallest

long_gen_data$generation_source <- factor(long_gen_data$generation_source, levels =  sources_by_sum$generation_source)

Smoothing the data on a daily scale

gen_data_smoothed <- long_gen_data %>% 
  group_by(generation_source) %>% 
  summarise_by_time(time, .by = 'day', adjusted = mean(value)) 

head(gen_data_smoothed)

Looking at four series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)

gen_data_smoothed %>% 
  filter(generation_source %in% sources_by_sum$generation_source[1:6]) %>% 
  plot_time_series(time, adjusted, .facet_ncol = 3,
                   .smooth_period = "3 months",
                   .title= "Top generation sources over time, with 3-month mean smoothing")

Looking at all generation sources together on an area chart

Seasonality within a day
hourly_generation_plot_by_year <- long_gen_data %>%
  ggplot(aes(x = hour, y = value, fill = generation_source)) +
  geom_bar(stat = "identity")

hourly_generation_plot_by_year

# Proportions
hourly_generation_plot_by_year_props <- long_gen_data_props %>%
  ggplot(aes(x = hour, y = value, fill = generation_source)) +
  geom_bar(stat = "identity")

hourly_generation_plot_by_year_props

Seasonality within a month
daily_generation_plot_by_year <- long_gen_data %>%
  ggplot(aes(x = day, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year) + theme(legend.position = "none") 

daily_generation_plot_by_year

# Note: Similar problem to raw values
daily_generation_plot_by_year_props <- long_gen_data_props %>%
  ggplot(aes(x = day, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year) + theme(legend.position = "none") 

daily_generation_plot_by_year_props

Seasonality within a year
monthly_generation_plot <- long_gen_data %>%
  ggplot(aes(x = month, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year)

monthly_generation_plot

# Note: Shorter bars for shorter months as above
monthly_generation_plot_props <- long_gen_data_props %>%
  ggplot(aes(x = month, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year)

monthly_generation_plot_props

```